# Benchmark window endpoints as ISO-8601 date strings (deliberately not Date
# objects: m1 measures the cost of letting difftime() coerce them).
start_date <- "2017-01-01"
end_date <- "2019-12-31"
# Whole weeks between two dates passed as character strings.
# difftime() coerces the strings itself (via POSIXct) — the slow path this
# benchmark compares against the explicit as.Date() cast in f2.
# Returns the floored week count (previously assigned to an unused local and
# returned invisibly; now returned directly).
f1 <- function(d2, d1) {
  floor(as.numeric(difftime(d2, d1, units = "weeks")))
}
# Whole weeks between two dates, casting to Date first so difftime() takes
# the cheap Date arithmetic path (no POSIXct/timezone machinery).
# Fix: removed the awkward mid-call line break and the dead local assignment.
f2 <- function(d2, d1) {
  floor(as.numeric(difftime(as.Date(d2), as.Date(d1), units = "weeks")))
}
# m1: does pre-casting with as.Date() beat letting difftime() coerce strings?
m1<-microbenchmark(
Nocast = f1(end_date, start_date),
Cast = f2(end_date, start_date),
times = 1000
)
print(m1)
## Unit: microseconds
## expr min lq mean median uq max neval
## Nocast 368.988 376.9135 401.1525 380.7910 392.202 4086.296 1000
## Cast 125.033 129.4370 137.7458 130.9895 134.892 2715.518 1000
# fbox_plot is a project helper defined elsewhere; the second argument is
# presumably the unit label for the plot — TODO confirm.
fbox_plot(m1, "microseconds")
# Intentionally slow baseline for m3: builds 1:n by growing a vector with c()
# on every iteration (quadratic copying). The growth is the anti-pattern this
# benchmark demonstrates — do not "optimize" it away.
# Fixes: seq() -> seq_len() (idiomatic safe sequence) and the built vector is
# now returned instead of the for-loop's invisible NULL.
no_size <- function(n) {
  x <- c()
  for (i in seq_len(n)) {
    x <- c(x, i)  # deliberate O(n^2) append
  }
  x
}
# Fast counterpart for m3: preallocates the full integer vector once, then
# fills it in place — no copying as the loop runs.
# Fixes: seq() -> seq_len() and the vector is returned instead of the
# for-loop's invisible NULL.
explicit_size <- function(n) {
  x <- vector("integer", n)
  for (i in seq_len(n)) {
    x[i] <- i
  }
  x
}
# m3: growing a vector with c() vs preallocating with vector().
m3 <- microbenchmark(
no_size = no_size(1e4),
explicit_size = explicit_size(1e4),
times = 10
)
print(m3)
## Unit: microseconds
## expr min lq mean median uq max
## no_size 68810.468 69096.43 74809.0331 70898.133 74436.928 101232.712
## explicit_size 327.411 331.95 659.0558 366.073 371.934 3217.043
## neval
## 10
## 10
fbox_plot(m3, "microseconds")
# NOTE(review): this binding shadows base::vector(). Function calls like
# vector("integer", n) still resolve to the base function (R skips non-function
# bindings when looking up a call), but a different name would be clearer.
vector <- runif(1e8)
# Count values above 0.5 by materializing the matching indices with which()
# and taking their length (the slower approach measured in m4).
w1 <- function(x) {
  n_above <- length(which(x > .5))
}
# Count values above 0.5 by summing the logical mask directly — no index
# vector is ever built (the faster approach measured in m4).
w2 <- function(x) {
  n_above <- sum(x > .5)
}
# m4: length(which(...)) vs sum(...) over a 1e8-element logical mask.
m4 <- microbenchmark(
which = w1(vector),
nowhich = w2(vector),
times = 10
)
print(m4)
## Unit: milliseconds
## expr min lq mean median uq max neval
## which 628.4986 630.2693 662.1551 631.4521 633.1820 850.1048 10
## nowhich 218.0632 220.0816 222.1767 223.0102 223.3368 225.1237 10
fbox_plot(m4, "miliseconds")
# Small two-column table for the row-vs-column subsetting benchmark (m5).
n <- 1e4
dt <- data.table(
a = seq(n), b = runif(n)
)
# Row-first: subset whole rows where b > 0.5, then pull column a and average.
# The intermediate row subset is what makes this the slower variant in m5.
v1 <- function(dt) {
  kept_rows <- dt[dt$b > .5, ]
  d <- mean(kept_rows$a)
}
# Column-first: subset the a vector directly with a logical mask — no row
# copy of the whole table (the faster variant in m5).
v2 <- function(dt) {
  mask <- dt$b > .5
  d <- mean(dt$a[mask])
}
# m5: row-subset-then-column vs column-subset-with-mask.
m5 <- microbenchmark(
row_operation = v1(dt),
column_operation = v2(dt),
times = 10
)
print(m5)
## Unit: microseconds
## expr min lq mean median uq max neval
## row_operation 193.361 196.326 971.7363 210.4925 248.074 5700.458 10
## column_operation 73.026 78.676 335.9857 82.7335 89.406 2512.659 10
fbox_plot(m5, "microseconds")
# seq() avoids the 1:x pitfall: when x is zero, 1:x counts down to c(1, 0),
# whereas seq(x) returns an empty sequence.
# Upper bound for the 1:num vs seq(num) benchmark (m6).
num <- 1e7
# Mean over 1:num. Note 1:num would count down to c(1, 0) if num were 0 —
# kept as-is because it is the comparison target of m6.
s1 <- function(num) {
  avg <- mean(1:num)
}
# Mean over seq(num) — the safe-sequence variant benchmarked in m6.
s2 <- function(num) {
  avg <- mean(seq(num))
}
# m6: 1:num vs seq(num) — timings below show they are indistinguishable.
m6<-microbenchmark(
noseq = s1(num),
seq = s2(num),
times = 30
)
print(m6)
## Unit: milliseconds
## expr min lq mean median uq max neval
## noseq 69.69519 69.97604 70.04254 69.99300 70.01719 71.82410 30
## seq 69.61423 69.98628 70.06606 69.99879 70.03862 71.64366 30
fbox_plot(m6, "miliseconds")
# 1e6-row table for the paste0() vs glue() string-building benchmark (m7).
large_dataset <- data.table(
id = 1:1000000,
value = sample(letters, 1000000, replace = TRUE)
)
# Build an "id_value" code column with base paste0() inside mutate().
a1 <- function(x) {
  d <- mutate(x, code = paste0(id, "_", value))
}
# Build the same code column with glue() string interpolation.
a2 <- function(x) {
  d <- mutate(x, code = glue("{id}_{value}"))
}
# m7: paste0() vs glue() for per-row string building.
m7 <- microbenchmark(
with_paste = a1(large_dataset),
with_glue = a2(large_dataset),
times = 20
)
print(m7)
## Unit: milliseconds
## expr min lq mean median uq max neval
## with_paste 562.4149 571.3529 580.9086 576.0443 581.9039 666.3175 20
## with_glue 585.6863 589.0650 612.7230 590.4762 594.3041 998.0038 20
fbox_plot(m7, "miliseconds")
# Create a large list
# m8: lapply vs an index for-loop over 1e5 short vectors.
big_list <- replicate(1e5, rnorm(10), simplify = FALSE)
m8 <- microbenchmark(
lapply = lapply(big_list, mean),
for_loop = {
# The result list is filled via [[i]] assignment (amortized-cheap growth,
# unlike c() concatenation).
result <- list()
for (i in seq_along(big_list)) {
result[[i]] <- mean(big_list[[i]])
}
},
times = 10
)
print(m8)
## Unit: milliseconds
## expr min lq mean median uq max neval
## lapply 316.7702 319.2393 335.3730 331.7764 348.6938 363.4554 10
## for_loop 345.7628 350.9287 386.2824 368.5049 401.2409 519.6330 10
fbox_plot(m8, "miliseconds")
# 100k-row table with both Date and integer-backed IDate columns for m9.
# NOTE: this reassigns `dt`, replacing the small table used by m5.
dt <- data.table(
Date = as.Date('2023-01-01') + 0:99999,
iDate = as.IDate('2023-01-01') + 0:99999,
Value = rnorm(100000)
)
# Matching [start, end] bounds in both date representations.
nd1 <- as.Date('2023-01-01')
nd2 <- as.Date('2023-01-10')
id1 <- as.IDate('2023-01-01')
id2 <- as.IDate('2023-01-10')
# Inclusive range filter on the Date column.
# NOTE(review): filters the global `dt`, not a parameter — confirm intended.
date_between_operation <- function(nd1, nd2) {
  filter(dt, Date >= nd1 & Date <= nd2)
}
# Inclusive range filter on the integer-backed IDate column via data.table
# syntax (same semantics as the original `dt |> _[...]` placeholder pipe).
idate_between_operation <- function(id1, id2) {
  dt[data.table::between(iDate, id1, id2)]
}
# m9: Date filter via dplyr vs IDate filter via data.table::between.
m9 <- microbenchmark(
  Date = date_between_operation(nd1, nd2),
  iDate = idate_between_operation(id1, id2),
  times = 200L
)
print(m9)
## Unit: microseconds
## expr min lq mean median uq max neval
## Date 1459.513 1515.393 1893.2597 1753.6175 2020.3000 11772.369 200
## iDate 570.534 611.176 721.0323 632.4755 798.3795 2240.191 200
# Fix: the reported unit is microseconds, but the plot label said "miliseconds".
fbox_plot(m9, "microseconds")
# Scalar string dispatch: the trailing unnamed value is switch()'s fallback
# for any x that matches none of the named cases.
switch_function <- function(x) {
  fruit <- switch(x,
                  "a" = "apple",
                  "b" = "banana",
                  "c" = "cherry",
                  "default")
  fruit
}
# Vectorized equivalent of the switch lookup. The arms are mutually
# exclusive, so their order does not affect the result; the TRUE arm catches
# everything else.
case_when_function <- function(x) {
  case_when(
    x == "c" ~ "cherry",
    x == "b" ~ "banana",
    x == "a" ~ "apple",
    TRUE ~ "default"
  )
}
# Create a vector of test values
test_values <- sample(c("a", "b", "c", "d"), 1000, replace = TRUE)
# m10: scalar switch() applied element-wise vs vectorized case_when().
# Note sapply() here returns a named character vector in both cases.
m10 <- microbenchmark(
switch = sapply(test_values, switch_function),
case_when = sapply(test_values, case_when_function),
times = 200L
)
print(m10)
## Unit: microseconds
## expr min lq mean median uq max
## switch 633.532 647.7385 678.8005 656.17 673.312 2165.25
## case_when 229197.459 243041.3725 246470.9864 245410.62 249228.503 387248.51
## neval
## 200
## 200
fbox_plot(m10, "microseconds")
# 1e6-row table of integer values 1..100 for the case_when vs fcase benchmark.
# NOTE(review): `data` shadows utils::data here.
set.seed(123)
n <- 1e6
data <- data.table(
id = seq(n),
value = sample(seq(100), n, replace = TRUE)
)
# Bucket `value` into three bands with dplyr::case_when. The conditions
# overlap and are evaluated in order, so "Low" wins for value <= 20.
casewhenf <- function(data) {
  out <- mutate(data, category = case_when(
    value <= 20 ~ "Low",
    value <= 70 ~ "Medium",
    value > 70 ~ "High"))
}
# Same three-band bucketing via data.table::fcase (condition, value pairs,
# evaluated in order like case_when).
fcasef <- function(data) {
  out <- mutate(data, category = fcase(
    value <= 20, "Low",
    value <= 70, "Medium",
    value > 70, "High"))
}
# m11: dplyr::case_when vs data.table::fcase on 1e6 rows.
m11 <- microbenchmark(
case_when = casewhenf(data),
fcase = fcasef(data),
times = 20
)
print(m11)
## Unit: milliseconds
## expr min lq mean median uq max neval
## case_when 56.36157 56.91179 60.62484 61.08828 62.99643 73.42582 20
## fcase 20.66431 20.88550 24.21154 21.55173 23.36648 46.32106 20
fbox_plot(m11, "miliseconds")
# 1e6-row table with NAs sprinkled into both value columns for m12.
set.seed(123)
DT <- data.table(
ID = 1:1e6,
Value1 = sample(c(NA, 1:100), 1e6, replace = TRUE),
Value2 = sample(c(NA, 101:200), 1e6, replace = TRUE)
)
# Define the functions
# Fill NAs with 0 via tidyr::replace_na. Note the replacement 0 is a double
# (vs fcoalesce_f's 0L) and the trailing as.data.table() is part of the
# measured work.
replace_na_f <- function(data) {
  out <- data |>
    mutate(
      Value1 = replace_na(Value1, 0),
      Value2 = replace_na(Value2, 0)
    ) |>
    as.data.table()
}
# Fill NAs with 0L via data.table::fcoalesce; the integer literal keeps the
# columns integer-typed.
fcoalesce_f <- function(data) {
  out <- data |>
    mutate(
      Value1 = fcoalesce(Value1, 0L),
      Value2 = fcoalesce(Value2, 0L)
    )
}
# m12: tidyr::replace_na vs data.table::fcoalesce for NA filling.
# NOTE(review): replace_na_f also pays for an as.data.table() conversion that
# fcoalesce_f does not — the comparison is slightly unfair.
m12 <- microbenchmark(
treplace_na = replace_na_f(DT),
tfcoalesce = fcoalesce_f(DT),
times = 20
)
print(m12)
## Unit: milliseconds
## expr min lq mean median uq max neval
## treplace_na 7.388027 7.471763 7.979352 7.718609 8.288047 10.21760 20
## tfcoalesce 1.556244 1.731797 2.867601 1.879382 2.405153 18.01402 20
fbox_plot(m12, "miliseconds")
# Four-row toy table for the negated-grepl filter benchmark (m13).
# NOTE: reassigns `dt` again (replaces the m9 table).
dt <- data.table(field_name = c("argentina.blue.man.watch",
"brazil.red.woman.shoes",
"canada.green.kid.hat",
"denmark.red.man.shirt"))
# Filter rows where 'field_name' does not contain 'red'
# data.table row filter: keep rows whose field_name does not match "red"
# (same semantics as the original `data |> _[...]` placeholder pipe).
dtnot <- function(data) {
  kept <- data[!grepl("red", field_name)]
}
# Same filter using base subsetting with an explicit logical mask instead of
# the anonymous-function pipe.
anonymousnot <- function(data) {
  keep <- !grepl("red", data$field_name)
  filtered_dt <- data[keep, ]
}
# Same filter via dplyr::filter.
dplyrnot <- function(data) {
  filtered_dt <- filter(data, !grepl("red", field_name))
}
# m13: three ways to negate a grepl filter on a tiny table.
m13 <- microbenchmark(
anonymous_not = anonymousnot(dt),
data_table_not = dtnot(dt),
dplyr_not = dplyrnot(dt),
times = 100
)
print(m13)
## Unit: microseconds
## expr min lq mean median uq max neval
## anonymous_not 103.544 110.6565 157.4315 121.111 142.1450 3050.883 100
## data_table_not 100.528 106.5985 140.5623 115.831 137.9675 1811.130 100
## dplyr_not 683.585 702.4205 772.8657 723.791 747.5755 3021.749 100
fbox_plot(m13, "microseconds")
# Wide table (id + four numeric columns) for the reshape benchmark (m14).
large_data <- data.table(
id = 1:100000,
var1 = rnorm(100000),
var2 = rnorm(100000),
var3 = rnorm(100000),
var4 = rnorm(100000)
)
# Benchmarking
# m14: tidyr::pivot_longer vs melt (dispatches to data.table::melt here).
m14 <- microbenchmark(
tidyr_pivot_longer = {
long_data_tidyr <- pivot_longer(large_data, cols = starts_with("var"),
names_to = "variable", values_to = "value")
},
data_table_melt = {
long_data_dt <- melt(large_data, id.vars = "id", variable.name = "variable",
value.name = "value")
},
times = 10
)
print(m14)
## Unit: microseconds
## expr min lq mean median uq max
## tidyr_pivot_longer 6639.901 6892.312 9591.4079 7025.8465 7106.542 33021.823
## data_table_melt 407.090 496.948 581.8058 572.0675 709.063 801.385
## neval
## 10
## 10
fbox_plot(m14, "microseconds")
# Two 1000-element vectors whose cross product (1e6 rows) is built in m15.
vec1 <- seq(1000)
vec2 <- seq(1000)
# Define functions to be benchmarked
# All combinations of the global vec1/vec2 via tidyr::expand_grid.
expand_grid_func <- function() {
  return(expand_grid(vec1, vec2))
}
# All combinations of the global vec1/vec2 via data.table's cross join CJ().
CJ_func <- function() {
  return(CJ(vec1, vec2))
}
# Perform benchmarking
# m15: tidyr::expand_grid vs data.table::CJ for a 1e6-row cross product.
m15 <- microbenchmark(
expand_grid = expand_grid_func(),
CJ = CJ_func(),
times = 10
)
print(m15)
## Unit: microseconds
## expr min lq mean median uq max neval
## expand_grid 2219.372 2261.751 2521.4544 2351.548 2386.073 3759.155 10
## CJ 455.861 463.785 649.7245 478.208 497.789 1836.868 10
fbox_plot(m15, "microseconds")
# Sample data
# 50 data.tables of `size` rows each (duplicate ids allowed) for the
# row-binding benchmark (m16).
# Fixes: <- instead of = for assignment; TRUE/FALSE instead of the
# reassignable shorthands T/F.
size <- 1e4
set.seed(44)
df_list <- replicate(50, data.table(id = sample(seq(size), size, replace = TRUE),
                                    value = rnorm(size)), simplify = FALSE)
# Base R: one do.call over the whole list (avoids rbind-in-a-loop copying).
simple_bind <- function(list_of_dfs) {
  do.call("rbind", list_of_dfs)
}
# dplyr's list row-binder.
dplyr_bind <- function(list_of_dfs) {
  bind_rows(list_of_dfs)
}
# data.table's list row-binder; fill = FALSE requires identical columns in
# every table (which holds for df_list).
# Fix: FALSE instead of the reassignable shorthand F.
dt_bind <- function(list_of_dfs) {
  rbindlist(list_of_dfs, fill = FALSE)
}
# Benchmark both methods
# m16: rbindlist vs do.call(rbind) vs bind_rows over 50 tables.
m16 <- microbenchmark(
dt_ver = dt_bind(df_list),
simple = simple_bind(df_list),
dplyr_ver = dplyr_bind(df_list),
times = 30
)
print(m16)
## Unit: microseconds
## expr min lq mean median uq max neval
## dt_ver 431.115 470.739 586.4407 512.6815 588.047 1976.799 30
## simple 488.862 519.830 611.7507 563.7670 623.764 1884.857 30
## dplyr_ver 10111.600 10384.979 10927.4029 10476.8610 10648.181 21415.835 30
fbox_plot(m16, "microseconds")
# One row of five dot-separated words, replicated n times, for the
# string-splitting benchmark (m17).
# Fix: FALSE instead of the reassignable shorthand F.
set.seed(123)
n <- 1e4
df <- data.table(text = paste("word1", "word2", "word3", "word4", "word5", sep = "."), stringsAsFactors = FALSE)
df <- df[rep(1, n), , drop = FALSE]
# Using tidyr::separate
# Split `text` into five columns on the literal dot, keep the original
# column, then drop the words that are not needed (only w3/w5 remain).
# Fix: FALSE instead of the reassignable shorthand F.
separate_words <- function() {
  df |>
    separate(text, into = c("w1", "w2", "w3", "w4", "w5"), sep = "\\.", remove = FALSE) |>
    select(-c(w1, w2, w4))
}
# Using stringr::word
# Pull the 3rd and 5th dot-separated word straight out of `text`; fixed(".")
# treats the dot literally rather than as a regex.
stringr_words <- function() {
  mutate(
    df,
    w3 = word(text, 3, sep = fixed(".")),
    w5 = word(text, 5, sep = fixed("."))
  )
}
datatable_words <- function() {
df |> _[, c("w3", "w5") := tstrsplit(text, "\\.")[c(3, 5)]]
}
# m17: three ways to extract words 3 and 5 from a dotted string column.
m17 <- microbenchmark(
separate = separate_words(),
stringr = stringr_words(),
dt = datatable_words(),
times = 10
)
print(m17)
## Unit: milliseconds
## expr min lq mean median uq max neval
## separate 79.83310 83.29129 94.29407 94.72704 97.69480 117.03869 10
## stringr 181.41822 185.22161 215.12042 211.97258 230.81153 288.14400 10
## dt 12.30277 12.43595 12.91484 12.54286 12.65337 15.33692 10
fbox_plot(m17, "miliseconds")
# Sample data
# 1e6-row table with NAs in y and z for the NA-dropping benchmark (m18).
# Fix: FALSE instead of the reassignable shorthand F.
set.seed(123)
n <- 1e6
df <- data.table(
  x = rnorm(n),
  y = sample(c(NA, 1:100), n, replace = TRUE),
  z = sample(c(NA, letters), n, replace = TRUE),
  stringsAsFactors = FALSE
)
# Benchmark both methods
# m18: tidyr::drop_na vs stats::na.omit on the same 1e6-row table.
m18 <- microbenchmark(
  dplyr_drop_na = {
    df |> drop_na()
  },
  data_table_na_omit = {
    # Bug fix: the original benchmarked `dt` — the tiny 4-row table defined
    # for m13 — instead of the 1e6-row `df` built above, which is why the
    # recorded na.omit timings below are implausibly small.
    df |> na.omit()
  },
  times = 10
)
print(m18)
## (timings below were produced with the original, mismatched inputs)
## Unit: microseconds
## expr min lq mean median uq max
## dplyr_drop_na 9365.868 9378.702 9593.9516 9417.189 9705.151 10220.052
## data_table_na_omit 8.756 9.097 51.8919 60.708 62.968 172.432
## neval
## 10
## 10
fbox_plot(m18, "microseconds")
# Sample data
# 100 tables plus one extra table, joined on id in m19. n_cores comes from
# parallelly::availableCores(), which accounts for container/cgroup limits.
set.seed(123)
size = 1e4
n_cores = parallelly::availableCores()
df_list <- replicate(100, data.table(id = sample(seq(size), size, replace = T),
value = rnorm(size)), simplify = F)
extra_df <- data.table(id = sample(seq(size), size, replace = T),
extra_value = runif(size))
# Sequential join
# Merge each table in the global df_list with extra_df on id. Duplicate ids
# on both sides can fan out, hence allow.cartesian.
# Fix: TRUE instead of the reassignable shorthand T.
sequential_join <- function() {
  lapply(df_list, function(df) {
    merge(df, extra_df, by = "id", allow.cartesian = TRUE)
  })
}
# Parallel join using mclapply
# Fork-based parallel variant of sequential_join. Note mclapply forking is
# not available on Windows (mc.cores must be 1 there).
# Fix: TRUE instead of the reassignable shorthand T.
parallel_join <- function() {
  mclapply(df_list, function(df) {
    merge(df, extra_df, by = "id", allow.cartesian = TRUE)
  }, mc.cores = n_cores, mc.silent = TRUE, mc.cleanup = TRUE)
}
# Benchmark both methods
# m19: sequential lapply vs forked mclapply for 100 keyed merges.
m19 <- microbenchmark(
sequential = sequential_join(),
parallel = parallel_join(),
times = 10
)
print(m19)
## Unit: milliseconds
## expr min lq mean median uq max neval
## sequential 593.2165 603.7509 655.2140 632.6488 693.8236 823.1743 10
## parallel 222.2954 241.5792 254.6547 248.7249 255.6517 325.9662 10
fbox_plot(m19, "miliseconds")
# Another alternative: dtplyr (install the package before running this benchmark).
# 1e7-row grouped table for the dplyr-direct vs dtplyr (lazy_dt) benchmark.
set.seed(123)
n <- 1e7
df <- data.table(
group1 = sample(LETTERS[1:10], n, replace = TRUE),
group2 = sample(letters[1:5], n, replace = TRUE),
value1 = rnorm(n),
value2 = runif(n, 1, 100)
)
# m21: the same filter/mutate/summarize pipeline run with dplyr directly on
# the data.table vs through dtplyr's lazy_dt translation layer.
# Fixes: result bindings renamed so they no longer shadow the dplyr/dtplyr
# package names, and <- used instead of = for assignment.
m21 <- microbenchmark(
  basic_way = {
    res_dplyr <- df |>
      filter(value1 > 0) |>
      mutate(ratio = value1 / value2) |>
      summarize(
        mean_val1 = mean(value1),
        sd_val1 = sd(value1),
        median_val2 = median(value2),
        max_ratio = max(ratio), .by = c("group1", "group2")) |>
      as.data.table()
  },
  dtplyr_way = {
    res_dtplyr <- df |>
      lazy_dt() |>
      filter(value1 > 0) |>
      mutate(ratio = value1 / value2) |>
      summarize(
        mean_val1 = mean(value1),
        sd_val1 = sd(value1),
        median_val2 = median(value2),
        max_ratio = max(ratio), .by = c("group1", "group2")) |>
      as.data.table()
  },
  times = 5
)
print(m21)
## Unit: milliseconds
## expr min lq mean median uq max neval
## basic_way 515.0258 553.2352 611.6810 648.2386 659.3700 682.5351 5
## dtplyr_way 622.0624 640.6803 701.3499 657.3354 766.8708 819.8007 5
fbox_plot(m21, "miliseconds")
# Read the in-progress "golden data" from a parquet dataset with arrow,
# select the session/memory/node columns, derive the same totals the duckdb
# variant computes in SQL, then materialize with collect() and convert to a
# data.table. Until collect(), select/mutate are lazy arrow operations.
# NOTE(review): node memory multipliers (160 GiB / 256 GiB per node, in MiB)
# are hard-coded — confirm against the cluster specs.
with_parquet <- function(){
fp_data <- "/conf/posit_azure_logs/data"
data_1 <- open_dataset(file.path(glue::glue("{fp_data}/golden_data_in_progress"))) |>
select(
date, hours, time,
ALL_WIP_CP_day_session, ALL_WIP_CP_night_session,
ALL_WIP_BP_day_session, ALL_WIP_BP_night_session,
ALL_WIP_CP_DS_mem_limit, ALL_WIP_CP_NS_mem_limit,
ALL_WIP_BP_DS_mem_limit, ALL_WIP_BP_NS_mem_limit,
ALL_WIP_CP_DS_mem_request, ALL_WIP_CP_NS_mem_request,
ALL_WIP_BP_DS_mem_request, ALL_WIP_BP_NS_mem_request,
ALL_WIP_CP_DS_mem_max, ALL_WIP_CP_NS_mem_max,
ALL_WIP_BP_DS_mem_max, ALL_WIP_BP_NS_mem_max,
ALL_WIP_CP_node_total, ALL_WIP_BP_node_total
) |>
mutate(
computepool_node_mem = ALL_WIP_CP_node_total * (160 * 1024),
bigpool_node_mem = ALL_WIP_BP_node_total * (256 * 1024),
ALL_WIP_day_session = ALL_WIP_CP_day_session + ALL_WIP_BP_day_session,
ALL_WIP_night_session = ALL_WIP_CP_night_session + ALL_WIP_BP_night_session,
ALL_WIP_node_total = ALL_WIP_CP_node_total + ALL_WIP_BP_node_total,
total_mem_limit = ALL_WIP_CP_DS_mem_limit + ALL_WIP_CP_NS_mem_limit + ALL_WIP_BP_DS_mem_limit + ALL_WIP_BP_NS_mem_limit,
total_mem_request = ALL_WIP_CP_DS_mem_request + ALL_WIP_CP_NS_mem_request + ALL_WIP_BP_DS_mem_request + ALL_WIP_BP_NS_mem_request,
total_mem_max = ALL_WIP_CP_DS_mem_max + ALL_WIP_CP_NS_mem_max + ALL_WIP_BP_DS_mem_max + ALL_WIP_BP_NS_mem_max,
total_node_mem = computepool_node_mem + bigpool_node_mem,
# Guard against division by zero when no nodes are up.
average_session_per_node = ifelse(ALL_WIP_node_total != 0,
(ALL_WIP_day_session + ALL_WIP_night_session) / ALL_WIP_node_total, 0)
) |>
collect() |>
as.data.table()
}
# Same derivation as with_parquet, but pushed into duckdb SQL against a local
# copy of the .duckdb file. Returns the result as a data.table.
# Fixes vs the original:
#  - file.copy(..., overwrite = TRUE): without it, every call after the first
#    silently keeps a stale local copy (file.copy returns FALSE).
#  - the duckdb connection is now held in `con` and released via on.exit()
#    (the original opened a new connection per call and never disconnected).
#  - file cleanup also moved to on.exit() so it runs even on error, and the
#    pointless double assignment (data_2 <- res_duckdb_sql <- ...) is gone.
with_duckfile <- function(){
  file.copy("/conf/posit_azure_logs/gatzos01/gd_inprogress.duckdb",
            "gd_inprogress.duckdb", overwrite = TRUE)
  con <- dbConnect(duckdb::duckdb(), dbdir = "./gd_inprogress.duckdb")
  on.exit({
    dbDisconnect(con, shutdown = TRUE)
    file.remove("./gd_inprogress.duckdb")
  }, add = TRUE)
  data_2 <- dbGetQuery(
    conn = con,
    statement = glue("select date, hours, time,
ALL_WIP_CP_day_session, ALL_WIP_CP_night_session,
ALL_WIP_BP_day_session, ALL_WIP_BP_night_session,
ALL_WIP_CP_DS_mem_limit, ALL_WIP_CP_NS_mem_limit,
ALL_WIP_BP_DS_mem_limit, ALL_WIP_BP_NS_mem_limit,
ALL_WIP_CP_DS_mem_request, ALL_WIP_CP_NS_mem_request,
ALL_WIP_BP_DS_mem_request, ALL_WIP_BP_NS_mem_request,
ALL_WIP_CP_DS_mem_max, ALL_WIP_CP_NS_mem_max,
ALL_WIP_BP_DS_mem_max, ALL_WIP_BP_NS_mem_max,
ALL_WIP_CP_node_total, ALL_WIP_BP_node_total,
ALL_WIP_CP_node_total * 160 * 1024 as computepool_node_mem,
ALL_WIP_BP_node_total * 256 * 1024 as bigpool_node_mem,
ALL_WIP_CP_day_session + ALL_WIP_BP_day_session as ALL_WIP_day_session,
ALL_WIP_CP_night_session + ALL_WIP_BP_night_session as ALL_WIP_night_session,
ALL_WIP_CP_node_total + ALL_WIP_BP_node_total as ALL_WIP_node_total,
ALL_WIP_CP_DS_mem_limit + ALL_WIP_CP_NS_mem_limit + ALL_WIP_BP_DS_mem_limit + ALL_WIP_BP_NS_mem_limit as total_mem_limit,
ALL_WIP_CP_DS_mem_request + ALL_WIP_CP_NS_mem_request + ALL_WIP_BP_DS_mem_request + ALL_WIP_BP_NS_mem_request as total_mem_request,
ALL_WIP_CP_DS_mem_max + ALL_WIP_CP_NS_mem_max + ALL_WIP_BP_DS_mem_max + ALL_WIP_BP_NS_mem_max as total_mem_max,
computepool_node_mem + bigpool_node_mem as total_node_mem,
CASE
WHEN ALL_WIP_node_total != 0 THEN (ALL_WIP_day_session + ALL_WIP_night_session) / ALL_WIP_node_total
ELSE 0
END AS average_session_per_node
from gdinprog"),
    immediate = TRUE) |>
    as.data.table()
  data_2
}
# m22: arrow parquet pipeline vs duckdb SQL over a local file copy.
# NOTE: only 3 iterations and each includes I/O (dataset open / file copy),
# so these timings are dominated by storage, not compute.
m22 <- microbenchmark(
with_parquet = with_parquet(),
with_duckfile = with_duckfile(),
times = 3
)
print(m22)
## Unit: milliseconds
## expr min lq mean median uq max
## with_parquet 10369.535 18304.6670 21565.5895 26239.7988 27163.62 28087.434
## with_duckfile 501.711 644.0689 927.7635 786.4268 1140.79 1495.153
## neval
## 3
## 3
fbox_plot(m22, "miliseconds")